import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
dfs = pd.read_excel("input_data.xlsx")
dfs.head(20)
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | -verified user details.(employee# & manager name)_x000D_\n-checked the user name in ad and reset the password._x000D_\n-advised the user to login and check._x000D_\n-caller confirmed that he was able to login._x000D_\n-issue resolved. | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | _x000D_\n_x000D_\nreceived from: hmjdrvpb.komuaywn@gmail.com_x000D_\n_x000D_\nhello team,_x000D_\n_x000D_\nmy meetings/skype meetings etc are not appearing in my outlook calendar, can somebody please advise how to correct this?_x000D_\n_x000D_\nkind | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log in to vpn | _x000D_\n_x000D_\nreceived from: eylqgodm.ybqkwiam@gmail.com_x000D_\n_x000D_\nhi_x000D_\n_x000D_\ni cannot log on to vpn_x000D_\n_x000D_\nbest | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable to access hr_tool page | unable to access hr_tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
| 5 | unable to log in to engineering tool and skype | unable to log in to engineering tool and skype | eflahbxn ltdgrvkz | GRP_0 |
| 6 | event: critical:HostName_221.company.com the value of mountpoint threshold for /oracle/SID_37/erpdata21/ | event: critical:HostName_221.company.com the value of mountpoint threshold for /oracle/SID_37/erpdata21/sr3psa1d_7/sr3psa1d.data7,perpsr3psa1d,4524 is 98 | jyoqwxhz clhxsoqy | GRP_1 |
| 7 | ticket_no1550391- employment status - new non-employee [enter user's name] | ticket_no1550391- employment status - new non-employee [enter user's name] | eqzibjhw ymebpoih | GRP_0 |
| 8 | unable to disable add ins on outlook | unable to disable add ins on outlook | mdbegvct dbvichlg | GRP_0 |
| 9 | ticket update on inplant_874773 | ticket update on inplant_874773 | fumkcsji sarmtlhy | GRP_0 |
| 10 | engineering tool says not connected and unable to submit reports | engineering tool says not connected and unable to submit reports | badgknqs xwelumfz | GRP_0 |
| 11 | hr_tool site not loading page correctly | hr_tool site not loading page correctly | dcqsolkx kmsijcuz | GRP_0 |
| 12 | unable to login to hr_tool to sgxqsuojr xwbesorf cards | unable to login to hr_tool to sgxqsuojr xwbesorf cards | oblekmrw qltgvspb | GRP_0 |
| 13 | user wants to reset the password | user wants to reset the password | iftldbmu fujslwby | GRP_0 |
| 14 | unable to open payslips | unable to open payslips | epwyvjsz najukwho | GRP_0 |
| 15 | ticket update on inplant_874743 | ticket update on inplant_874743 | fumkcsji sarmtlhy | GRP_0 |
| 16 | unable to login to company vpn | \n\nreceived from: xyz@company.com\n\nhi,\n\ni am unable to login to company vpn website, trying to open a new session using the below link, but not able to get through pls help urgently as we are working from home tomorrow due to month end closing. | chobktqj qdamxfuc | GRP_0 |
| 17 | when undocking pc , screen will not come back | when undocking pc , screen will not come back | sigfdwcj reofwzlm | GRP_3 |
| 18 | erp SID_34 account locked | erp SID_34 account locked | nqdyowsm yqerwtna | GRP_0 |
| 19 | unable to sign into vpn | unable to sign into vpn | ftsqkvre bqzrupic | GRP_0 |
dfs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8500 entries, 0 to 8499 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Short description 8492 non-null object 1 Description 8499 non-null object 2 Caller 8500 non-null object 3 Assignment group 8500 non-null object dtypes: object(4) memory usage: 265.8+ KB
dfs["Assignment group"].nunique()
74
dfs["Assignment group"].value_counts()
GRP_0 3976 GRP_8 661 GRP_24 289 GRP_12 257 GRP_9 252 GRP_2 241 GRP_19 215 GRP_3 200 GRP_6 184 GRP_13 145 GRP_10 140 GRP_5 129 GRP_14 118 GRP_25 116 GRP_33 107 GRP_4 100 GRP_29 97 GRP_18 88 GRP_16 85 GRP_17 81 GRP_31 69 GRP_7 68 GRP_34 62 GRP_26 56 GRP_40 45 GRP_28 44 GRP_41 40 GRP_30 39 GRP_15 39 GRP_42 37 GRP_20 36 GRP_45 35 GRP_22 31 GRP_1 31 GRP_11 30 GRP_21 29 GRP_47 27 GRP_62 25 GRP_48 25 GRP_23 25 GRP_60 20 GRP_39 19 GRP_27 18 GRP_37 16 GRP_44 15 GRP_36 15 GRP_50 14 GRP_65 11 GRP_53 11 GRP_52 9 GRP_55 8 GRP_51 8 GRP_49 6 GRP_46 6 GRP_59 6 GRP_43 5 GRP_66 4 GRP_32 4 GRP_68 3 GRP_63 3 GRP_38 3 GRP_58 3 GRP_56 3 GRP_57 2 GRP_54 2 GRP_69 2 GRP_71 2 GRP_72 2 GRP_61 1 GRP_64 1 GRP_67 1 GRP_35 1 GRP_70 1 GRP_73 1 Name: Assignment group, dtype: int64
plt.subplots(figsize = (20,5))
plt.xticks(rotation=90)
plt.plot(dfs["Assignment group"].value_counts())
plt.xlabel("Assignment group")
plt.ylabel("count")
plt.grid()
plt.show()
dfs.isnull().sum()
Short description 8 Description 1 Caller 0 Assignment group 0 dtype: int64
dfs[dfs.isnull().any(axis=1)]
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 2604 | NaN | _x000D_\n_x000D_\nreceived from: ohdrnswl.rezuibdt@gmail.com_x000D_\n_x000D_\nhi,_x000D_\n_x000D_\n_x000D_\n_x000D_\nlink is not working. kindly resolve yhe issue on urgent basis._x000D_\n_x000D_\n_x000D_\n_x000D_\nbest | ohdrnswl rezuibdt | GRP_34 |
| 3383 | NaN | _x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the portal._x000D_\n-issue resolved. | qftpazns fxpnytmk | GRP_0 |
| 3906 | NaN | -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. | awpcmsey ctdiuqwe | GRP_0 |
| 3910 | NaN | -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. | rhwsmefo tvphyura | GRP_0 |
| 3915 | NaN | -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. | hxripljo efzounig | GRP_0 |
| 3921 | NaN | -user unable tologin to vpn._x000D_\n-connected to the user system using teamviewer._x000D_\n-help the user login to the company vpn using the vpn company vpn link._x000D_\n-issue resolved. | cziadygo veiosxby | GRP_0 |
| 3924 | NaN | name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:microsoft internet explorer\nemail:wvqgbdhm.fwchqjor@gmail.com\ncustomer number:\ntelephone:-not available\nsummary:can't get into vpn - need to be on at 4:30 est and it wont' happen please help aerp! | wvqgbdhm fwchqjor | GRP_0 |
| 4341 | NaN | _x000D_\n_x000D_\nreceived from: eqmuniov.ehxkcbgj@gmail.com_x000D_\n_x000D_\ngood morning,_x000D_\n_x000D_\nwhen trying to log on to erp i get this error below._x000D_\nplease help urgently as i can not process customer order._x000D_\n_x000D_\n[cid:image001.png@01d20f2e.751db880]_x000D_\n_x000D_\n_x000D_\n | eqmuniov ehxkcbgj | GRP_0 |
| 4395 | i am locked out of skype | NaN | viyglzfo ajtfzpkb | GRP_0 |
duplicate_df=dfs[dfs.duplicated(subset = None, keep = 'first')]
duplicate_df.shape
(83, 4)
duplicate_df
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 51 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 229 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 493 | ticket update on inplant_872730 | ticket update on inplant_872730 | fumkcsji sarmtlhy | GRP_0 |
| 512 | blank call //gso | blank call //gso | rbozivdq gmlhrtvp | GRP_0 |
| 667 | job bkbackup_tool_powder_prod_full failed in job_scheduler at: 10/22/2016 18:33:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob bkbackup_tool_powder_prod_full failed in job_scheduler at: 10/22/2016 18:33:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 724 | blank call | blank call | rbozivdq gmlhrtvp | GRP_0 |
| 1064 | job Job_1967d failed in job_scheduler at: 10/19/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1967d failed in job_scheduler at: 10/19/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 1125 | blank call | blank call | rbozivdq gmlhrtvp | GRP_0 |
| 1744 | phone issue | phone issue | gzjtweph mnslwfqv | GRP_0 |
| 1851 | reset passwords for fylrosuk kedgmiul using password_management_tool password reset. | the | fylrosuk kedgmiul | GRP_17 |
| 1982 | call came and got disconnected | call came and got disconnected | rbozivdq gmlhrtvp | GRP_0 |
| 2000 | job Job_549 failed in job_scheduler at: 10/07/2016 23:05:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_549 failed in job_scheduler at: 10/07/2016 23:05:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 2061 | blank call // loud noise // gso | blank call // loud noise // gso | rbozivdq gmlhrtvp | GRP_0 |
| 2141 | blank call | blank call | rbozivdq gmlhrtvp | GRP_0 |
| 2533 | reset passwords for qwsjptlo hnlasbed using password_management_tool password reset. | the | goaxzsql qpjnbgsa | GRP_17 |
| 2554 | reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. | the | bxeagsmt zrwdgsco | GRP_17 |
| 2683 | ticket update | ticket update | pbhmwqtz wqlbudjx | GRP_0 |
| 2714 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 2720 | german call | german call | ayrhcfxi zartupsw | GRP_0 |
| 2789 | blank call | blank call | pwkrlqbc zslqfmka | GRP_0 |
| 2875 | blank call | blank call | pwkrlqbc zslqfmka | GRP_0 |
| 2876 | blank call | blank call | pwkrlqbc zslqfmka | GRP_0 |
| 3085 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 3219 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 3619 | call came and got disconnected | call came and got disconnected | rbozivdq gmlhrtvp | GRP_0 |
| 3637 | blank call | blank call | fumkcsji sarmtlhy | GRP_0 |
| 3647 | ç”å¤: ç”å¤: order products online problem | _x000D_\n_x000D_\nreceived from: fkdazsmi.yecbrofv@gmail.com_x000D_\n_x000D_\nhello,_x000D_\ni will ask your help if i can't solve it with the help of my boss tomorrow. | fkdazsmi yecbrofv | GRP_0 |
| 3693 | reset passwords for mvhcoqed konjdmwq using password_management_tool password reset. | the | mvhcoqed konjdmwq | GRP_17 |
| 3908 | vpn not working- vpn.company.com link is giving error | vpn not working- vpn.company.com link is giving error | kailyenh zfyvkopr | GRP_0 |
| 4094 | job Job_2883 failed in job_scheduler at: 09/18/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_2883 failed in job_scheduler at: 09/18/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 4229 | not able to access -inq industrial (-inq.industrial@company.com) | _x000D_\n_x000D_\nreceived from: muqdlobv.qflsdahg@gmail.com_x000D_\n_x000D_\nteam,_x000D_\n_x000D_\ni am not able access to -inq industrial (-inq.industrial@company.com<mailto:-inq.industrial@company.com>) for which we receive enquiries about special products._x000D_\nfrom past one week above mentioned shared email box is not functioning for me, whereas other members are able to access the same._x000D_\nso i kindly request you to take it upon high priority and confirm back._x000D_\n_x000D_\n | muqdlobv qflsdahg | GRP_0 |
| 4273 | blank call | blank call | pwkrlqbc zslqfmka | GRP_0 |
| 4303 | call for ecwtrjnq jpecxuty | call for ecwtrjnq jpecxuty | olckhmvx pcqobjnd | GRP_0 |
| 4361 | account locked in ad | account locked in ad | gvsabjhq cgwsbiep | GRP_0 |
| 4495 | job SID_37hoti failed in job_scheduler at: 09/13/2016 19:53:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_37hoti failed in job_scheduler at: 09/13/2016 19:53:00 | bpctwhsn kzqsbmtp | GRP_5 |
| 4530 | blank call | blank call | fumkcsji sarmtlhy | GRP_0 |
| 4550 | call disconnected due to vpn disconnection | call disconnected due to vpn disconnection | rbozivdq gmlhrtvp | GRP_0 |
| 4704 | private address fields are enabled on employee master crm ui | disable private address fields, new & edit buttons on employee master crm ui | tavsikpl dcrkwuny | GRP_15 |
| 4881 | install company barcode für ewew8323504 \vzqomdgt jwoqbuml | install company barcode für ewew8323504 \vzqomdgt jwoqbuml | vzqomdgt jwoqbuml | GRP_24 |
| 4984 | reset passwords for cubdsrml znewqgop using password_management_tool password reset. | the | cubdsrml znewqgop | GRP_17 |
| 4991 | reset passwords for davidthd robankm using password_management_tool password reset. | the | zelunfcq yimdwjrp | GRP_17 |
| 5212 | blank call | blank call | fumkcsji sarmtlhy | GRP_0 |
| 5226 | blank call | blank call | olckhmvx pcqobjnd | GRP_0 |
| 5317 | reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. | the | bxeagsmt zrwdgsco | GRP_17 |
| 5488 | job SID_38hotf failed in job_scheduler at: 09/01/2016 22:20:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_38hotf failed in job_scheduler at: 09/01/2016 22:20:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 5521 | blank call //gso | blank call //gso | rbozivdq gmlhrtvp | GRP_0 |
| 5708 | reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. | the | bxeagsmt zrwdgsco | GRP_17 |
| 5884 | reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. | the | bxeagsmt zrwdgsco | GRP_17 |
| 5928 | ticket update on inplant_855239 | ticket update on inplant_855239 | fumkcsji sarmtlhy | GRP_0 |
| 5945 | blank call //gso | blank call //gso | rbozivdq gmlhrtvp | GRP_0 |
| 6058 | reset passwords for bxeagsmt zrwdgsco using password_management_tool password reset. | the | bxeagsmt zrwdgsco | GRP_17 |
| 6130 | job Job_749 failed in job_scheduler at: 08/27/2016 15:29:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_749 failed in job_scheduler at: 08/27/2016 15:29:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6141 | job Job_1989 failed in job_scheduler at: 08/27/2016 11:24:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1989 failed in job_scheduler at: 08/27/2016 11:24:00 | bpctwhsn kzqsbmtp | GRP_6 |
| 6252 | job Job_3028 failed in job_scheduler at: 08/26/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/26/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6260 | job Job_3028 failed in job_scheduler at: 08/25/2016 22:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/25/2016 22:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6265 | job pp_EU_tool_netch_ap1 failed in job_scheduler at: 08/25/2016 16:14:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob pp_EU_tool_netch_ap1 failed in job_scheduler at: 08/25/2016 16:14:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6321 | job Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 | bpctwhsn kzqsbmtp | GRP_60 |
| 6323 | job Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_1314 failed in job_scheduler at: 08/25/2016 08:15:00 | bpctwhsn kzqsbmtp | GRP_60 |
| 6340 | probleme mit erpgui \vsdtxwry ngkcdjye | probleme mit erpgui \vsdtxwry ngkcdjye | vsdtxwry ngkcdjye | GRP_24 |
| 6411 | svc-now ticket found... doing nothing | received from: monitoring_tool@company.com_x000D_\n_x000D_\nsvc-now ticket found... doing nothing | bpctwhsn kzqsbmtp | GRP_60 |
| 6412 | svc-now ticket found... doing nothing | received from: monitoring_tool@company.com_x000D_\n_x000D_\nsvc-now ticket found... doing nothing | bpctwhsn kzqsbmtp | GRP_60 |
| 6471 | job SID_41arc2 failed in job_scheduler at: 08/24/2016 07:04:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_41arc2 failed in job_scheduler at: 08/24/2016 07:04:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6485 | job SID_31arc2 failed in job_scheduler at: 08/24/2016 06:09:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob SID_31arc2 failed in job_scheduler at: 08/24/2016 06:09:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6521 | job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6522 | job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6523 | job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6524 | job Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/24/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6603 | account unlock | account unlock | jusenflm sufbehom | GRP_0 |
| 6659 | job Job_3028 failed in job_scheduler at: 08/23/2016 00:02:00 | received from: monitoring_tool@company.com_x000D_\n_x000D_\njob Job_3028 failed in job_scheduler at: 08/23/2016 00:02:00 | bpctwhsn kzqsbmtp | GRP_8 |
| 6739 | blank call // gso | blank call // gso | rbozivdq gmlhrtvp | GRP_0 |
| 6819 | reset passwords for wvdxnkhf jirecvta using password_management_tool password reset. | the | wvdxnkhf jirecvta | GRP_17 |
| 6942 | call came and got disconnected | call came and got disconnected | rbozivdq gmlhrtvp | GRP_0 |
| 6992 | probleme mit erpgui \tmqfjard qzhgdoua | probleme mit erpgui \tmqfjard qzhgdoua | tmqfjard qzhgdoua | GRP_24 |
| 7034 | blank call | blank call | fumkcsji sarmtlhy | GRP_0 |
| 7132 | reset passwords for ezrsdgfc hofgvwel using password_management_tool password reset. | the | ezrsdgfc hofgvwel | GRP_17 |
| 7459 | account locked in ad | account locked in ad | upiyobvj lwohuizr | GRP_0 |
| 7756 | german call | german call | rbozivdq gmlhrtvp | GRP_0 |
| 7772 | blank call // loud noise | blank call // loud noise | rbozivdq gmlhrtvp | GRP_0 |
| 7836 | probleme mit erpgui \tmqfjard qzhgdoua | probleme mit erpgui \tmqfjard qzhgdoua | tmqfjard qzhgdoua | GRP_24 |
| 8051 | issue on pricing in distributor_tool | we have agreed price with many of the distributors for a given period and skus. this is specified through pricing condition zcnc in erp. in distributor_tool, it is order through sold to & ship to combination._x000D_\n- till august – flat rate deployment, we did not have any issues._x000D_\n- today, when the distributor tried to book the order with zcnc pricing condition. the initial screen shows the correct price. but when the item is selected and quick order is clicked on, the price is getting changed to list price less standard discount instead of retaining the zcnc price. | hbmwlprq ilfvyodx | GRP_21 |
| 8093 | reset passwords for prgthyuulla ramdntythanjesh using password_management_tool password reset. | the | boirqctx bkijgqry | GRP_17 |
| 8347 | blank call // loud noise | blank call // loud noise | rbozivdq gmlhrtvp | GRP_0 |
| 8405 | unable to launch outlook | unable to launch outlook | wjtzrmqc ikqpbflg | GRP_0 |
#drop NaNs and duplicate rows
dfs.dropna(inplace=True)
dfs.shape
(8491, 4)
dfs.drop_duplicates(subset = None, keep = 'first',inplace=True)
dfs.shape
(8408, 4)
#preprocess and clean the data
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
#define a function for preprocessing the data
def preprocess_text(df, column_name=''):
# Remove email Ids
df[column_name] = df[column_name].apply(lambda x: re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,4}',' ',x))
# Remove label url link
df[column_name]=df[column_name].apply(lambda x: re.sub(r'urlLink|urllink','',x))
# remove all the places where that starts with http or https
df[column_name]=df[column_name].apply(lambda x: re.sub(r'https?\S+','',x))
# Strip unwanted spaces
df[column_name] = df[column_name].apply(lambda x: x.strip())
# Select only alphabets
df[column_name] = df[column_name].apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))
# Convert text to lowercase
df[column_name] = df[column_name].apply(lambda x: x.lower())
# Remove stopwords
df[column_name] = df[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
#Remove hello
df[column_name] = df[column_name].apply(lambda x: re.sub(r'hello ', ' ', x))
# Replace empty strings with Null
df[column_name].replace('', np.nan, inplace = True)
# Drop Null values
df = df.dropna()
return df
[nltk_data] Downloading package stopwords to [nltk_data] /Users/dgupta3/nltk_data... [nltk_data] Package stopwords is already up-to-date!
dfs = preprocess_text(dfs, column_name='Description')
dfs = preprocess_text(dfs, column_name='Short description')
dfs.head(20)
| Short description | Description | Caller | Assignment group | |
|---|---|---|---|---|
| 0 | login issue | verified user details employee manager name x checked user name ad reset password x advised user login check x caller confirmed able login x issue resolved | spxjnwir pjlcoqds | GRP_0 |
| 1 | outlook | x x received x x team x x meetings skype meetings etc appearing outlook calendar somebody please advise correct x x kind | hmjdrvpb komuaywn | GRP_0 |
| 2 | cant log vpn | x x received x x hi x x cannot log vpn x x best | eylqgodm ybqkwiam | GRP_0 |
| 3 | unable access hr tool page | unable access hr tool page | xbkucsvz gcpydteq | GRP_0 |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 |
| 5 | unable log engineering tool skype | unable log engineering tool skype | eflahbxn ltdgrvkz | GRP_0 |
| 6 | event critical hostname company com value mountpoint threshold oracle sid erpdata | event critical hostname company com value mountpoint threshold oracle sid erpdata sr psa sr psa data perpsr psa | jyoqwxhz clhxsoqy | GRP_1 |
| 7 | ticket employment status new non employee enter user name | ticket employment status new non employee enter user name | eqzibjhw ymebpoih | GRP_0 |
| 8 | unable disable add ins outlook | unable disable add ins outlook | mdbegvct dbvichlg | GRP_0 |
| 9 | ticket update inplant | ticket update inplant | fumkcsji sarmtlhy | GRP_0 |
| 10 | engineering tool says connected unable submit reports | engineering tool says connected unable submit reports | badgknqs xwelumfz | GRP_0 |
| 11 | hr tool site loading page correctly | hr tool site loading page correctly | dcqsolkx kmsijcuz | GRP_0 |
| 12 | unable login hr tool sgxqsuojr xwbesorf cards | unable login hr tool sgxqsuojr xwbesorf cards | oblekmrw qltgvspb | GRP_0 |
| 13 | user wants reset password | user wants reset password | iftldbmu fujslwby | GRP_0 |
| 14 | unable open payslips | unable open payslips | epwyvjsz najukwho | GRP_0 |
| 15 | ticket update inplant | ticket update inplant | fumkcsji sarmtlhy | GRP_0 |
| 16 | unable login company vpn | received hi unable login company vpn website trying open new session using link able get pls help urgently working home tomorrow due month end closing | chobktqj qdamxfuc | GRP_0 |
| 17 | undocking pc screen come back | undocking pc screen come back | sigfdwcj reofwzlm | GRP_3 |
| 18 | erp sid account locked | erp sid account locked | nqdyowsm yqerwtna | GRP_0 |
| 19 | unable sign vpn | unable sign vpn | ftsqkvre bqzrupic | GRP_0 |
#check the distribution of words in each sentence
word_count_SD = dfs['Short description'].apply(lambda x: len(x.split()))
sns.displot(word_count_SD)
<seaborn.axisgrid.FacetGrid at 0x7fcca844d130>
word_count_Des = dfs['Description'].apply(lambda x: len(x.split()))
sns.displot(word_count_Des)
<seaborn.axisgrid.FacetGrid at 0x7fccc868a6d0>
!pip install wordcloud
import heapq
from wordcloud import WordCloud
## function to create Word Cloud
def show_wordcloud(data, title):
wordcloud = WordCloud(
background_color='white',
stopwords=stopwords,
max_words=100,
max_font_size=40,
scale=3,
random_state=1
).generate(str(data))
type(wordcloud.words_.keys())
#print("Top words are: ", heapq.nlargest(100, wordcloud.words_.keys()))
print("Top words are: ", list(wordcloud.words_.keys())[0: 10])
fig = plt.figure(1, figsize=(15, 12))
plt.axis('off')
if title:
fig.suptitle(title, fontsize=20)
fig.subplots_adjust(top=2.3)
plt.imshow(wordcloud)
plt.show()
Requirement already satisfied: wordcloud in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (1.8.2.2) Requirement already satisfied: matplotlib in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (3.5.1) Requirement already satisfied: pillow in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (9.0.1) Requirement already satisfied: numpy>=1.6.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from wordcloud) (1.21.5) Requirement already satisfied: kiwisolver>=1.0.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (1.3.2) Requirement already satisfied: fonttools>=4.22.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (4.25.0) Requirement already satisfied: python-dateutil>=2.7 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: packaging>=20.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (21.3) Requirement already satisfied: pyparsing>=2.2.1 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (3.0.4) Requirement already satisfied: cycler>=0.10 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from matplotlib->wordcloud) (0.11.0) Requirement already satisfied: six>=1.5 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
topGroups = dfs["Assignment group"].value_counts().head(10)
for items in topGroups.iteritems():
groupName = items[0]
text_Str1 = dfs['Short description'][dfs['Assignment group'].isin([groupName])].tolist()
show_wordcloud(text_Str1, groupName + " WORD CLOUD" )
print("\n")
Top words are: ['erp', "password reset'", 'outlook', 'erp sid', "account locked'", 'ticket update', 'password reset', "issue'", 'access', 'user']
Top words are: ['failed job', "job scheduler'", "scheduler' job", 'job failed', 'network outage', 'company', "since et'", 'job scheduler', 'circuit outage', 'abended job']
Top words are: ['probleme mit', 'f r', 'setup new', 'new ws', 'ewew', 'rechner', 'install', 'defekt', 'eu tool', 'drucker']
Top words are: ['hostname', 'server', 'access', "hostname'", 'space consumed', 'consumed space', 'disk free', 'file', 'folder', 'average samples']
Top words are: ['failed job', "job scheduler'", 'job failed', "scheduler' job", 'abended job', 'job scheduler', "scheduler job'", "job' abended", 'bobj', 'report']
Top words are: ['user', 'account', 'erp access', 'sid', 'need', 'erp', 'security incidents', 'please', "access issue'", 'password']
Top words are: ['laptop', "issue'", "working'", 'unable', 'printer', 'need', 'new', "laptop'", 'system', 'id']
Top words are: ['pc', 'cannot', 'monitor', 'print', 'printer', 'need', 'new', 'get', 'unable', 'client']
Top words are: ['failed job', "job scheduler'", "scheduler' job", 'job failed', 'job scheduler', 'abended job', 'snp heu', "scheduler job'", 'create delivery', "job' abended"]
Top words are: ['inwarehouse tool', 'order', 'erp', 'sale', 'customer', 'unable', 'please', 'error', 'mm', 'price']
for items in topGroups.iteritems():
groupName = items[0]
text_Str1 = dfs['Description'][dfs['Assignment group'].isin([groupName])].tolist()
show_wordcloud(text_Str1, groupName + " WORD CLOUD" )
print("\n")
Top words are: ['x x', 'x received', 'received x', "x x'", 'outlook', 'erp sid', 'access', 'need', "password reset'", 'x please']
Top words are: ['x x', 'yes na', 'na x', 'backup circuit', 'received x', 'x job', 'failed job', "job scheduler'", 'telecom vendor', 'na yes']
Top words are: ['probleme mit', 'f r', 'ewew', 'setup new', 'new ws', 'x x', 'install', 'und', 'rechner', 'defekt']
Top words are: ['x x', 'hostname', 'server', 'asa deny', 'deny tcp', 'tcp src', 'src inside', 'inside dst', 'dst outside', 'outside access']
Top words are: ['x x', 'received x', 'job job', 'x job', 'failed job', "job scheduler'", 'job failed', "scheduler' received", 'report', 'x abended']
Top words are: ['x x', 'f e', 'e f', 'e e', 'sid sid', 'f f', 'x event', 'e c', 'asa deny', 'deny tcp']
Top words are: ['x x', 'laptop', 'x received', 'please', 'received x', 'system', 'x sep', 'issue', 'working', 'name']
Top words are: ['x x', 'x', 'need', 'pc', 'x sep', 'sep asa', 'inside x', 'user', 'please', 'printer']
Top words are: ['x x', 'received x', 'job job', 'failed job', "job scheduler'", "scheduler' received", 'job failed', 'plant', 'x abended', 'abended job']
Top words are: ['x x', 'inwarehouse tool', 'order', 'customer', 'item', 'erp', 'x please', 'issue', 'received x', "x x'"]
!pip install fasttext
import fasttext
lang_model = fasttext.load_model('lid.176.bin')
sentences = ['unable check payslips']
predictions = lang_model.predict(sentences)
print(predictions[0])
#this was detected as french: [['__label__fr']]
def language_detect(sentences):
return lang_model.predict(sentences)[0]
dfs['language'] = dfs['Short description'].apply(language_detect)
#Language detection did not work using below
#pre-trained models lid.176.bin(126 MB) and lid.176.ftz(917kb)
Requirement already satisfied: fasttext in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (0.9.2) Requirement already satisfied: numpy in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (1.21.5) Requirement already satisfied: pybind11>=2.2 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (2.10.0) Requirement already satisfied: setuptools>=0.7.0 in /Users/dgupta3/opt/anaconda3/lib/python3.9/site-packages (from fasttext) (61.2.0) [['__label__fr']]
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
dfs.head(20)
| Short description | Description | Caller | Assignment group | language | |
|---|---|---|---|---|---|
| 0 | login issue | verified user details employee manager name x checked user name ad reset password x advised user login check x caller confirmed able login x issue resolved | spxjnwir pjlcoqds | GRP_0 | (__label__en,) |
| 1 | outlook | x x received x x team x x meetings skype meetings etc appearing outlook calendar somebody please advise correct x x kind | hmjdrvpb komuaywn | GRP_0 | (__label__en,) |
| 2 | cant log vpn | x x received x x hi x x cannot log vpn x x best | eylqgodm ybqkwiam | GRP_0 | (__label__en,) |
| 3 | unable access hr tool page | unable access hr tool page | xbkucsvz gcpydteq | GRP_0 | (__label__en,) |
| 4 | skype error | skype error | owlgqjme qhcozdfx | GRP_0 | (__label__ja,) |
| 5 | unable log engineering tool skype | unable log engineering tool skype | eflahbxn ltdgrvkz | GRP_0 | (__label__en,) |
| 6 | event critical hostname company com value mountpoint threshold oracle sid erpdata | event critical hostname company com value mountpoint threshold oracle sid erpdata sr psa sr psa data perpsr psa | jyoqwxhz clhxsoqy | GRP_1 | (__label__en,) |
| 7 | ticket employment status new non employee enter user name | ticket employment status new non employee enter user name | eqzibjhw ymebpoih | GRP_0 | (__label__en,) |
| 8 | unable disable add ins outlook | unable disable add ins outlook | mdbegvct dbvichlg | GRP_0 | (__label__en,) |
| 9 | ticket update inplant | ticket update inplant | fumkcsji sarmtlhy | GRP_0 | (__label__en,) |
| 10 | engineering tool says connected unable submit reports | engineering tool says connected unable submit reports | badgknqs xwelumfz | GRP_0 | (__label__en,) |
| 11 | hr tool site loading page correctly | hr tool site loading page correctly | dcqsolkx kmsijcuz | GRP_0 | (__label__en,) |
| 12 | unable login hr tool sgxqsuojr xwbesorf cards | unable login hr tool sgxqsuojr xwbesorf cards | oblekmrw qltgvspb | GRP_0 | (__label__en,) |
| 13 | user wants reset password | user wants reset password | iftldbmu fujslwby | GRP_0 | (__label__en,) |
| 14 | unable open payslips | unable open payslips | epwyvjsz najukwho | GRP_0 | (__label__fr,) |
| 15 | ticket update inplant | ticket update inplant | fumkcsji sarmtlhy | GRP_0 | (__label__en,) |
| 16 | unable login company vpn | received hi unable login company vpn website trying open new session using link able get pls help urgently working home tomorrow due month end closing | chobktqj qdamxfuc | GRP_0 | (__label__en,) |
| 17 | undocking pc screen come back | undocking pc screen come back | sigfdwcj reofwzlm | GRP_3 | (__label__en,) |
| 18 | erp sid account locked | erp sid account locked | nqdyowsm yqerwtna | GRP_0 | (__label__en,) |
| 19 | unable sign vpn | unable sign vpn | ftsqkvre bqzrupic | GRP_0 | (__label__en,) |